import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
#model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
#Bagging
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")
#Boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
#Tuning
from time import time
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.pipeline import Pipeline, make_pipeline
# Load the raw churn data set; `bank` keeps the untouched original,
# all cleaning below happens on the working copy `data`.
bank = pd.read_csv("BankChurners.csv") # use copy to store the original data
data = bank.copy()
data.head()
print(f"The data has {data.shape[0]} rows and {data.shape[1]} columns")
data.info()
The data does not contain any missing values.
Data Dictionary:
# CLIENTNUM is a unique per-customer identifier with no predictive value.
data.drop(["CLIENTNUM"], axis=1, inplace = True) # Drop customer unique info, we are not using this
# Summary statistics of the numeric columns (transposed for readability).
data.describe().T
# Cardinality of every column, highest first — guides which columns to
# treat as categorical in the next step.
data.nunique().sort_values(ascending=False)
To reduce memory usage, convert the low-cardinality object columns to the category dtype.
# Low-cardinality columns (strings and small discrete counts) are cheaper
# to store and faster to group as the pandas 'category' dtype.
to_category = ['Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
               'Gender', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
               'Contacts_Count_12_mon', 'Dependent_count']
for column_name in to_category:
    data[column_name] = data[column_name].astype('category')
Since the target variable (Attrition_Flag) has only 2 categories, it is reasonable to encode them as 0 and 1.
# Binary-encode the target in a single pass:
# 'Attrited Customer' -> 0 (churned), 'Existing Customer' -> 1 (retained).
data['Attrition_Flag'] = data['Attrition_Flag'].replace(
    {'Attrited Customer': 0, 'Existing Customer': 1})
data.info()
# Dump the distinct values of every column to eyeball typos, rare levels
# and placeholder values (e.g. 'Unknown').
all_col = ['Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count',
           'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category',
           'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon',
           'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
           'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
           'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
for column_name in all_col:
    print(f"{column_name} = {data[column_name].unique()}")
    print()
# Column groups reused by the EDA plots below:
# continuous / high-cardinality numerics -> distribution plots
small_num_col = ['Customer_Age','Credit_Limit','Total_Revolving_Bal','Avg_Open_To_Buy','Total_Amt_Chng_Q4_Q1',
'Total_Trans_Amt','Total_Trans_Ct','Total_Ct_Chng_Q4_Q1','Avg_Utilization_Ratio']
# discrete numerics with few levels -> count plots
big_num_col = ['Dependent_count','Months_on_book','Total_Relationship_Count','Months_Inactive_12_mon',
'Contacts_Count_12_mon']
# categorical columns (incl. the encoded target) -> count plots
cat_col = ['Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 'Gender', 'Attrition_Flag']
for i in small_num_col:
    # NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 and removed
    # later — consider histplot/displot; confirm the pinned seaborn version.
    sns.distplot(data[i])
    plt.show()
for i in big_num_col:
    sns.countplot(data[i])
    plt.show()
for i in cat_col:
    sns.countplot(data[i])
    plt.show()
Although there are no NaN values, 'Unknown' values could be considered missing. However, since they were still collected and recorded during the data-gathering process, 'Unknown' may act as its own category (it may reflect customers' personalities, etc.). Hence, no missing-value treatment is applied.
After a first pass of EDA, a couple of errors/outliers were detected.
# Inspect the rows flagged as outliers/errors during the soft EDA.
data[data['Credit_Limit']>33000]
data[data['Months_on_book']==36]
data[data['Total_Amt_Chng_Q4_Q1']>3]
data[data['Total_Ct_Chng_Q4_Q1']>3]
# Drop the five offending rows by index label.
# NOTE(review): hard-coded index labels are fragile — they only match this
# exact CSV and would silently drop the wrong rows if the file changes.
data=data.drop([1,8,12,269,773])
# Pairwise correlations of the numeric columns (table + annotated heatmap),
# then a full pair plot of the data.
data.corr()
plt.subplots(figsize=(10,10))
sns.heatmap(data.corr(), annot =True, linewidth=1)
sns.pairplot(data)
# Distribution of each continuous feature split by churn status.
for i in small_num_col:
    plt.figure(figsize=(10,5))
    sns.violinplot(x=data['Attrition_Flag'], y=data[i])
    plt.show()
# Target class counts broken down by each categorical feature.
for i in cat_col:
    sns.countplot(data['Attrition_Flag'], hue = data[i])
    plt.show()
# Separate features and target; one-hot encode the categoricals
# (drop_first avoids the dummy-variable trap).
X = data.drop(['Attrition_Flag'], axis=1)
X = pd.get_dummies(X, drop_first=True)
y = data['Attrition_Flag']
# Stratified 70/30 split keeps the class ratio identical in train and test.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
print(x_train.shape, x_test.shape)
print(f"Class distribution: {y.value_counts(1)}")
print("")
# BUG FIX: the original line was missing the f-prefix, so the placeholder
# "{y_test.value_counts(1)}" was printed literally instead of the values.
print(f"Test Class distribution: {y_test.value_counts(1)}") # good distribution
def get_score_acc_rec_prec(model):
    '''
    Compute train/test accuracy, recall and precision for a fitted classifier.

    model : fitted classifier exposing .predict(X) and .score(X, y)

    Uses the module-level split x_train/x_test/y_train/y_test.

    Returns a list ordered as:
    [0: train_acc, 1: test_acc, 2: train_recall, 3: test_recall,
     4: train_precision, 5: test_precision]
    '''
    scores = []  # renamed from `list`: the original shadowed the builtin
    pred_train = model.predict(x_train)
    pred_test = model.predict(x_test)
    # Accuracy straight from the estimator; recall/precision from sklearn.metrics
    # (binary default: the positive class is label 1, i.e. existing customers).
    scores.append(model.score(x_train, y_train))
    scores.append(model.score(x_test, y_test))
    scores.append(metrics.recall_score(y_train, pred_train))
    scores.append(metrics.recall_score(y_test, pred_test))
    scores.append(metrics.precision_score(y_train, pred_train))
    scores.append(metrics.precision_score(y_test, pred_test))
    return scores  # train and test scores in the documented order
def print_score(scores, what="all"):
    '''
    Pretty-print scores produced by get_score_acc_rec_prec.

    scores : sequence ordered [train_acc, test_acc, train_recall, test_recall,
             train_precision, test_precision]
    what   : "acc", "rec" or "prec" to print one metric pair;
             any other value prints all three pairs.
    '''
    # (first parameter renamed from `list`: the original shadowed the builtin;
    # every call site in this file passes it positionally)
    if what == "acc":
        print("Accuracy on training set : ", scores[0])
        print("Accuracy on test set : ", scores[1])
    elif what == "rec":
        print("Recall on training set : ", scores[2])
        print("Recall on test set : ", scores[3])
    elif what == "prec":
        print("Precision on training set : ", scores[4])
        print("Precision on test set : ", scores[5])
    else:
        print("Accuracy on training set : ", scores[0])
        print("Accuracy on test set : ", scores[1])
        print("Recall on training set : ", scores[2])
        print("Recall on test set : ", scores[3])
        print("Precision on training set : ", scores[4])
        print("Precision on test set : ", scores[5])
def make_cm(model, y_actual, labels=(1, 0)):
    '''
    Plot a confusion-matrix heatmap for `model` evaluated on the
    module-level x_test.

    model    : fitted classifier to predict with
    y_actual : ground-truth labels (e.g. y_test)
    labels   : label order for the matrix rows/columns
               (default (1, 0): positive class first; tuple avoids the
               original's mutable default argument)
    '''
    y_predict = model.predict(x_test)
    # BUG FIX: the original accepted `labels` but ignored it, hard-coding
    # [1, 0]; the matrix and its headers now both honor the parameter.
    label_list = list(labels)
    cm = metrics.confusion_matrix(y_actual, y_predict, labels=label_list)
    df_cm = pd.DataFrame(cm,
                         index=[str(lab) for lab in label_list],
                         columns=[f"Predict {lab}" for lab in label_list])
    # fmt='' lets seaborn print the raw counts unformatted.
    sns.heatmap(df_cm, annot=True, fmt='')
# --- Baseline models (default hyper-parameters, fixed random_state=1) ---
# Pattern for each: fit on the training split, plot its confusion matrix,
# then compute and print train/test accuracy, recall and precision.
lg = LogisticRegression(random_state=1)
lg.fit(x_train,y_train)
make_cm(lg,y_test)
lg_score = get_score_acc_rec_prec(lg)
print_score(lg_score,'all')
# Single decision tree (gini impurity).
dTree = DecisionTreeClassifier(criterion='gini',random_state=1)
dTree.fit(x_train, y_train)
make_cm(dTree,y_test)
dTree_score = get_score_acc_rec_prec(dTree)
print_score(dTree_score,'all')
# Random forest (bagged trees with feature subsampling).
rForest = RandomForestClassifier(random_state=1)
rForest.fit(x_train,y_train)
make_cm(rForest,y_test)
rForest_score = get_score_acc_rec_prec(rForest)
print_score(rForest_score,'all')
# Plain bagging ensemble (default base estimator).
bClassifier = BaggingClassifier(random_state=1)
bClassifier.fit(x_train,y_train)
make_cm(bClassifier,y_test)
bClassifier_score = get_score_acc_rec_prec(bClassifier)
print_score(bClassifier_score,'all')
# Boosting family: AdaBoost, gradient boosting, then XGBoost.
adaBoost = AdaBoostClassifier(random_state=1)
adaBoost.fit(x_train,y_train)
make_cm(adaBoost,y_test)
adaBoost_score = get_score_acc_rec_prec(adaBoost)
print_score(adaBoost_score,'all')
gBoost = GradientBoostingClassifier(random_state=1)
gBoost.fit(x_train,y_train)
make_cm(gBoost,y_test)
gBoost_score = get_score_acc_rec_prec(gBoost)
print_score(gBoost_score,'all')
# eval_metric set explicitly to silence the XGBoost default-metric warning.
xgBoost = XGBClassifier(random_state=1, eval_metric='logloss')
xgBoost.fit(x_train,y_train)
make_cm(xgBoost,y_test)
xgBoost_score = get_score_acc_rec_prec(xgBoost)
print_score(xgBoost_score,'all')
# Collect train/test accuracy, recall and precision for every fitted model
# and lay them side by side in a single comparison table.
modelList = [lg, dTree, rForest, bClassifier, adaBoost, gBoost, xgBoost]
model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest',
               'Bagging Classifier', 'AdaBoost Classifier',
               'Gradient Boosting Classifier', 'XGBoost Classifier']
# One score list per model, ordered [train_acc, test_acc, train_rec,
# test_rec, train_prec, test_prec] as documented on the helper.
all_scores = [get_score_acc_rec_prec(m) for m in modelList]
acc_train = [s[0] for s in all_scores]
acc_test = [s[1] for s in all_scores]
rec_train = [s[2] for s in all_scores]
rec_test = [s[3] for s in all_scores]
prec_train = [s[4] for s in all_scores]
prec_test = [s[5] for s in all_scores]
comparison_frame = pd.DataFrame({'Model': model_names,
                                 'TrainAccuracy': acc_train, 'TestAccuracy': acc_test,
                                 'TrainRecall': rec_train, 'TestRecall': rec_test,
                                 'TrainPrecision': prec_train, 'TestPrecision': prec_test})
comparison_frame
Choose Random Forest, AdaBoost, and XGBoost.
(Precision is more important in this case.)
Reasons:
We want to see whether hyperparameter tuning can drastically improve Random Forest.
XGBoost and AdaBoost have higher performance without clear overfitting.
# --- Random Forest hyper-parameter tuning ---
# Exhaustive grid over depth, feature count, split/leaf sizes,
# bootstrap on/off and the split criterion.
param_grid_rForest = {"max_depth": [2, None],
"max_features": [1, 3, 10],
"min_samples_split": [2, 3, 8],
"min_samples_leaf": [1, 3, 7],
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}
%%time
# NOTE(review): this search fits on the FULL data set (X, y), not on
# x_train/y_train, so its best_score_ is not directly comparable with the
# hold-out scores above — confirm this is intentional.
grid_search_rForest = GridSearchCV(rForest, param_grid=param_grid_rForest)
grid_search_rForest.fit(X, y)
print(f"Best Parameters:{grid_search_rForest.best_params_} \nScore: {grid_search_rForest.best_score_}")
#grid_search_rForest.cv_results_['mean_test_score']
#grid_search_rForest.best_estimator_
# Randomized search: integer parameters are sampled from uniform ranges
# instead of being enumerated exhaustively.
param_dist_rForest = {"max_depth": [2, None],
"max_features": sp_randint(1, 12),
"min_samples_split": sp_randint(2, 12),
"min_samples_leaf": sp_randint(1, 12),
"bootstrap": [True, False],
"criterion": ["gini", "entropy"]}
%%time
# NOTE(review): RandomizedSearchCV's default cv is 5 in sklearn >= 0.22,
# not 3 — confirm against the installed version.
random_search_rForest= RandomizedSearchCV(rForest, param_distributions=param_dist_rForest, n_iter=10)
random_search_rForest.fit(X, y)
print(f"Best Parameters:{random_search_rForest.best_params_} \nScore: {random_search_rForest.best_score_}")
%%time
# --- AdaBoost tuning (grid search) ---
# The pipeline standardizes features first; grid keys are prefixed with the
# pipeline step name ("adaboostclassifier__").
pipeline = make_pipeline(StandardScaler(), AdaBoostClassifier(random_state=1))
param_grid_adaBoost = {
"adaboostclassifier__n_estimators": np.arange(10, 80, 10),
"adaboostclassifier__learning_rate": [0.01,0.05,0.1,0.3,1],
# NOTE(review): `base_estimator` was renamed to `estimator` in sklearn 1.2 —
# confirm the pinned sklearn version still accepts this key.
"adaboostclassifier__base_estimator": [
DecisionTreeClassifier(max_depth=1, random_state=1),
DecisionTreeClassifier(max_depth=2, random_state=1),
DecisionTreeClassifier(max_depth=3, random_state=1),
],
}
# Use Accuracy since both recall and precision are equally important in reality.
scorer_adaBoost = metrics.make_scorer(metrics.accuracy_score)
grid_search_adaBoost = GridSearchCV(estimator=pipeline, param_grid=param_grid_adaBoost,
scoring=scorer_adaBoost, cv=5, n_jobs = -1)
grid_search_adaBoost.fit(x_train, y_train)
print(f"Best Parameters:{grid_search_adaBoost.best_params_} \nScore: {grid_search_adaBoost.best_score_}")
%%time
# --- AdaBoost tuning (randomized search over the same space, 50 draws) ---
pipeline1 = make_pipeline(StandardScaler(), AdaBoostClassifier(random_state=1))
param_dist_adaBoost = {
"adaboostclassifier__n_estimators": np.arange(10, 80, 10),
"adaboostclassifier__learning_rate": [0.01,0.05,0.1,0.3,1],
"adaboostclassifier__base_estimator": [
DecisionTreeClassifier(max_depth=1, random_state=1),
DecisionTreeClassifier(max_depth=2, random_state=1),
DecisionTreeClassifier(max_depth=3, random_state=1),
],
}
# Use Accuracy since both recall and precision are equally important in reality.
scorer_adaBoost1 = metrics.make_scorer(metrics.accuracy_score)
random_search_adaBoost = RandomizedSearchCV(estimator=pipeline1, param_distributions=param_dist_adaBoost, n_iter=50,
scoring=scorer_adaBoost1, cv=5, random_state=1)
random_search_adaBoost.fit(x_train,y_train)
print(f"Best parameters are {random_search_adaBoost.best_params_} \nscore={random_search_adaBoost.best_score_}:")
%%time
# --- XGBoost tuning (grid search) ---
pipeline2=make_pipeline(StandardScaler(), XGBClassifier(random_state=1,eval_metric='logloss'))
# NOTE(review): scale_pos_weight=0 assigns the positive class zero weight —
# confirm that candidate value is intentional.
param_grid_XGBoost={
'xgbclassifier__n_estimators':np.arange(50,250,50),'xgbclassifier__scale_pos_weight':[0,1,3,7],
'xgbclassifier__learning_rate':[0.01,0.05,0.1,0.2], 'xgbclassifier__gamma':[0,2,5],
'xgbclassifier__subsample':[0.7,0.85,1]
}
# Accuracy scorer, consistent with the AdaBoost searches above.
scorer_XGBoost = metrics.make_scorer(metrics.accuracy_score)
grid_search_XGBoost = GridSearchCV(estimator=pipeline2, param_grid=param_grid_XGBoost,
scoring=scorer_XGBoost, cv=5, n_jobs = -1)
grid_search_XGBoost.fit(x_train,y_train)
print(f"Best parameters are {grid_search_XGBoost.best_params_} \nscore={grid_search_XGBoost.best_score_}")
%%time
# --- XGBoost tuning (randomized search over a wider space, 50 draws) ---
# n_estimators=50 set in the pipeline is overridden by the search space below.
pipeline3=make_pipeline(StandardScaler(),XGBClassifier(random_state=1,eval_metric='logloss', n_estimators = 50))
#Parameter grid to pass in RandomizedSearchCV
param_dist_XGBoost={
'xgbclassifier__n_estimators':np.arange(50,250,50),
'xgbclassifier__scale_pos_weight':[0,1,3,7],
'xgbclassifier__learning_rate':[0.01,0.05,0.1,0.2],
'xgbclassifier__gamma':[0,2,5],
'xgbclassifier__subsample':[0.7,0.85,1],
'xgbclassifier__max_depth':np.arange(1,8,1),
'xgbclassifier__reg_lambda':[0,1,3,7]
}
scorer_XGBoost1 = metrics.make_scorer(metrics.accuracy_score)
random_search_XGBoost = RandomizedSearchCV(estimator=pipeline3, param_distributions=param_dist_XGBoost, n_iter=50,
scoring=scorer_XGBoost1, cv=5, random_state=1)
random_search_XGBoost.fit(x_train,y_train)
print(f"Best parameters are {random_search_XGBoost.best_params_} with CV score={random_search_XGBoost.best_score_}:")
It seems that XGBoost has the best accuracy. However, it is important to note that with big data the time cost can be significant. We recommend the second-best model when it must be retrained often (24 min for XGBoost with <1% higher accuracy vs. 2 min for AdaBoost with good-enough accuracy).
Since the purpose of the project is to find the customers who are likely to stop using credit card, it is also important to check the customers' behaviors. From EDA, it became more clear that:
Consider the following business recommendations: